Why do we like some songs more than others? Is there something about a song that pleases our subconscious, making us listen to it on repeat? To understand this I collected various attributes from a selection of songs available in Spotify's "All out ..s" playlists, starting from the 50s up to the newly ended 10s. Can you find the secret sauce to make a song popular? Content This data repo contains 7 datasets (.csv files), each representing a Spotify "All out ..s" type of playlist. Those playlists collect the most popular/iconic songs from the decade. For each song, a set of attributes has been reported in order to perform some data analysis. The attributes have been scraped from this amazing website. In particular, according to the website the attributes are: • top genre: genre of the song • year: year of the song (due to re-releases, the year might not correspond to the release year of the original song) • bpm(beats per minute): beats per minute • nrgy(energy): energy of a song, the higher the value the more energetic the song is • dnce(danceability): the higher the value, the easier it is to dance to this song. • dB(loudness): the higher the value, the louder the song. • live(liveness): the higher the value, the more likely the song is a live recording. • val(valence): the higher the value, the more positive mood for the song. • dur(duration): the duration of the song. • acous(acousticness): the higher the value the more acoustic the song is. • spch(speechiness): the higher the value the more spoken word the song contains. • pop(popularity): the higher the value the more popular the song is.
In this first step we are going to load the needed libraries and our datasets
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
from numpy import arange
import os
import pandas as pd
from pandas import read_csv
from pandas import set_option
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
#load 7 CSV files
# Each CSV holds one decade of the Spotify "All out ..s" playlist songs.
df_1950=pd.read_csv('datasets_492577_915991_1950.csv')
df_1960=pd.read_csv('datasets_492577_915991_1960.csv')
df_1970=pd.read_csv('datasets_492577_915991_1970.csv')
df_1980=pd.read_csv('datasets_492577_915991_1980.csv')
df_1990=pd.read_csv('datasets_492577_915991_1990.csv')
df_2000=pd.read_csv('datasets_492577_915991_2000.csv')
df_2010=pd.read_csv('datasets_492577_915991_2010.csv')
a) Descriptive statistics b) Data visualization
# Quick look at the 1950s dataset: shape, dtypes, missing values,
# descriptive statistics and pairwise Pearson correlations.
nRowsRead=1000
# NOTE(review): nRowsRead is never used afterwards — presumably a leftover
# from a pd.read_csv(..., nrows=nRowsRead) template; confirm and remove.
nRow,nCol=df_1950.shape
print(f'There are {nRow} rows and {nCol} columns in df_1950')
df_1950.info()
#let's change year to object since we plan to transform the year into a categorical variable
convert_dict={'year':object}
df_1950=df_1950.astype(convert_dict)
print(df_1950.dtypes)
df_1950.isnull().sum()  # bare expressions only display output in a notebook cell
df_1950.describe()
#Let's calculate the correlations between attributes
df_1950.corr(method='pearson')
nrgy is highly correlated to dB, val and acous dnce is highly correlated to val dB is highly correlated to nrgy
When it comes to feature selection we will use the Variance Inflation Factor to select the best features. The correlations show us that the correlated variables might not be used together in the model.
#Let's take a look at the skewness
# skew() > 0 means a right tail, < 0 a left tail; |skew| > 1 is highly skewed.
df_1950.skew()
If the skewness is between -0.5 and 0.5, the data are fairly symmetrical: dnce, dB, val, dur
If the skewness is between -1 and -0.5(negatively skewed) or between 0.5 and 1(positively skewed), the data are moderately skewed:bpm, nrgy,pop
If the skewness is less than -1(negatively skewed) or greater than 1(positively skewed), the data are highly skewed: live,acous,spch
import seaborn as sns
sns.set()
# Density and histogram of 'live' (one of the most skewed attributes).
sns.kdeplot(df_1950['live'], shade=True)  # NOTE(review): shade= is deprecated in newer seaborn (use fill=True)
sns.distplot(df_1950['live']);  # NOTE(review): distplot is deprecated — use histplot/displot in seaborn >= 0.11
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot a histogram (numeric) or bar chart (categorical) per column of *df*.

    Parameters
    ----------
    df : pandas.DataFrame
    nGraphShown : int
        Maximum number of columns to plot.
    nGraphPerRow : int
        Number of subplots per figure row.
    """
    nunique = df.nunique()
    # For displaying purposes, pick columns that have between 1 and 50 unique values
    df = df[[col for col in df if 1 < nunique[col] < 50]]
    nRow, nCol = df.shape
    columnNames = list(df)
    # Ceiling division must stay integral: plt.subplot rejects float grid sizes,
    # and `/` produced a float here (bug in the original).
    nGraphRow = (nCol + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num=None, figsize=(6 * nGraphPerRow, 8 * nGraphRow), dpi=80, facecolor='w', edgecolor='k')
    for i in range(min(nCol, nGraphShown)):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        if not np.issubdtype(type(columnDf.iloc[0]), np.number):
            # Categorical column: bar chart of value frequencies.
            valueCounts = columnDf.value_counts()
            valueCounts.plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation=90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad=1.0, w_pad=1.0, h_pad=1.0)
    plt.show()
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Heat-map of pairwise correlations between the numeric columns of *df*.

    Parameters
    ----------
    df : pandas.DataFrame
    graphWidth : int
        Figure width/height in inches.
    """
    # Correlations are only defined for numeric data (pandas >= 2.0 raises on
    # mixed frames); also drop constant columns, which have no correlation.
    df = df.select_dtypes(include=[np.number])
    df = df[[col for col in df if df[col].nunique() > 1]]  # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum=1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    # The original title said "Correlation Matrix for (unknown)" via a dead
    # `filename = df` assignment — both were template leftovers.
    plt.title('Correlation Matrix', fontsize=15)
    plt.show()
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Scatter-matrix with KDE diagonals, annotated with correlation coefficients.

    Parameters
    ----------
    df : pandas.DataFrame
    plotSize : int   figure width/height in inches
    textSize : int   font size for the correlation annotations
    """
    df = df.select_dtypes(include=[np.number])  # keep only numerical columns
    # Remove columns that would lead to df being singular
    df = df[[col for col in df if df[col].nunique() > 1]]  # keep columns where there are more than 1 unique values
    columnNames = list(df)
    if len(columnNames) > 10:  # reduce the number of columns for matrix inversion of kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # Annotate the upper triangle of the matrix with each pair's coefficient.
    # np.triu_indices_from replaces `plt.np`, an alias removed from matplotlib.
    for i, j in zip(*np.triu_indices_from(ax, k=1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction',
                          ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()
# Run the three visualisation helpers on the 1950s data, then per-attribute
# box plots to spot outliers.
plotPerColumnDistribution(df_1950,nGraphShown=15,nGraphPerRow=3)
plotCorrelationMatrix(df_1950, 8)
#Scatter and density plots
plotScatterMatrix(df_1950, plotSize=20, textSize=10)
from pandas import read_csv
from matplotlib import pyplot
#filename='datasets_492577_915991_1950.csv'
#names=['bpm','nrgy','dnce','db','live','val','dur','acous','spch','pop']
#data_50=read_csv(filename,names=names)
df_1950[['bpm','nrgy','dnce','dB','live','val','dur','acous','spch','pop']].plot(kind='box',subplots=True,layout=(3,4),sharex=False,sharey=False)
pyplot.show()
# Same exploration for the 1960s dataset: shape, dtypes, missing values,
# descriptive statistics and correlations.
nRowsRead=1000
nRow,nCol=df_1960.shape
print(f'There are {nRow} rows and {nCol} columns in df_1960')
df_1960.info()
#let's change year to object since we plan to transform the year into a categorical variable
convert_dict={'year':object}
df_1960=df_1960.astype(convert_dict)
print(df_1960.dtypes)
df_1960.isnull().sum()
df_1960.describe()
df_1960.corr()
bpm is moderately correlated to acous; nrgy is highly correlated to dB, val and acous; dnce is highly correlated to val
When it comes to feature selection we will use the Variance Inflation Factor to select the best features. The correlations show us that the correlated variables might not be used together in the model.
#let's look at the skewness
# Skewness per numeric attribute of the 1960s data.
df_1960.skew()
If the skewness is between -0.5 and 0.5, the data are fairly symmetrical: bpm, nrgy, dnce, val, acous, pop
If the skewness is between -1 and -0.5(negatively skewed) or between 0.5 and 1(positively skewed), the data are moderately skewed:dB
If the skewness is less than -1(negatively skewed) or greater than 1(positively skewed), the data are highly skewed: live, dur, spch
# Visualisations for the 1960s, then the same exploration for the 1970s.
plotPerColumnDistribution(df_1960,nGraphShown=15,nGraphPerRow=3)
plotCorrelationMatrix(df_1960, 8)
#Scatter and density plots
plotScatterMatrix(df_1960, plotSize=20, textSize=10)
from pandas import read_csv
from matplotlib import pyplot
df_1960[['bpm','nrgy','dnce','dB','live','val','dur','acous','spch','pop']].plot(kind='box',subplots=True,layout=(3,4))
pyplot.show()
nRowsRead=1000
nRow,nCol=df_1970.shape
print(f'There are {nRow} rows and {nCol} columns in df_1970')
df_1970.info()
#let's change year to object since we plan to transform the year into a categorical variable
convert_dict={'year':object}
df_1970=df_1970.astype(convert_dict)
print(df_1970.dtypes)
df_1970.isnull().sum()
df_1970.describe()
df_1970.corr()
nrgy is highly correlated with dB, with acous and moderately correlated with spch dnce is highly correlated with val val is moderately correlated with acous
# let's take a look at the skewness
# Skewness per numeric attribute of the 1970s data.
df_1970.skew()
If the skewness is between -0.5 and 0.5, the data are fairly symmetrical: bpm, nrgy, dnce, val, pop
If the skewness is between -1 and -0.5(negatively skewed) or between 0.5 and 1(positively skewed), the data are moderately skewed:dB,acous
If the skewness is less than -1(negatively skewed) or greater than 1(positively skewed), the data are highly skewed: live, dur, spch
# Visualisations for the 1970s, then the same exploration for the 1980s.
plotPerColumnDistribution(df_1970,nGraphShown=15,nGraphPerRow=3)
plotCorrelationMatrix(df_1970, 8)
#Scatter and density plots
plotScatterMatrix(df_1970, plotSize=20, textSize=10)
from pandas import read_csv
from matplotlib import pyplot
df_1970[['bpm','nrgy','dnce','dB','live','val','dur','acous','spch','pop']].plot(kind='box',subplots=True,layout=(3,4))
pyplot.show()
nRowsRead=1000
nRow,nCol=df_1980.shape
print(f'There are {nRow} rows and {nCol} columns in df_1980')
df_1980.info()
#let's change year to object since we plan to transform the year into a categorical variable
convert_dict={'year':object}
df_1980=df_1980.astype(convert_dict)
print(df_1980.dtypes)
df_1980.isnull().sum()
df_1980.describe()
df_1980.corr()
nrgy is highly correlated with db and val dnce is highly correlated with val
For this decade there are not too much correlated variables
# let's check the skewness
# Skewness per numeric attribute of the 1980s data.
df_1980.skew()
If the skewness is between -0.5 and 0.5, the data are fairly symmetrical: bpm, nrgy, dnce, val, pop
If the skewness is between -1 and -0.5(negatively skewed) or between 0.5 and 1(positively skewed), the data are moderately skewed:dB,acous
If the skewness is less than -1(negatively skewed) or greater than 1(positively skewed), the data are highly skewed: live, dur, spch
# Visualisations for the 1980s, then the same exploration for the 1990s.
plotPerColumnDistribution(df_1980,nGraphShown=15,nGraphPerRow=3)
plotCorrelationMatrix(df_1980, 8)
#Scatter and density plots
plotScatterMatrix(df_1980, plotSize=20, textSize=10)
from pandas import read_csv
from matplotlib import pyplot
df_1980[['bpm','nrgy','dnce','dB','live','val','dur','acous','spch','pop']].plot(kind='box',subplots=True,layout=(3,4))
pyplot.show()
nRowsRead=1000
nRow,nCol=df_1990.shape
print(f'There are {nRow} rows and {nCol} columns in df_1990')
df_1990.info()
#let's change year to object since we plan to transform the year into a categorical variable
convert_dict={'year':object}
df_1990=df_1990.astype(convert_dict)
print(df_1990.dtypes)
df_1990.isnull().sum()
df_1990.describe()
df_1990.corr()
nrgy is highly correlated to val and acous dnce is highly correlated to val val is moderately correlated to dur and acous
#let's take a look at the skewness
# Skewness per numeric attribute of the 1990s data.
df_1990.skew()
If the skewness is between -0.5 and 0.5, the data are fairly symmetrical: bpm, nrgy, dnce, dB, val, pop
If the skewness is between -1 and -0.5(negatively skewed) or between 0.5 and 1(positively skewed), the data are moderately skewed:pop
If the skewness is less than -1(negatively skewed) or greater than 1(positively skewed), the data are highly skewed: live, dur, acous, spch
# Visualisations for the 1990s, then the same exploration for the 2000s.
plotPerColumnDistribution(df_1990,nGraphShown=15,nGraphPerRow=3)
plotCorrelationMatrix(df_1990, 8)
#Scatter and density plots
plotScatterMatrix(df_1990, plotSize=20, textSize=10)
from pandas import read_csv
from matplotlib import pyplot
df_1990[['bpm','nrgy','dnce','dB','live','val','dur','acous','spch','pop']].plot(kind='box',subplots=True,layout=(3,4))
pyplot.show()
nRowsRead=1000
nRow,nCol=df_2000.shape
print(f'There are {nRow} rows and {nCol} columns in df_2000')
df_2000.info()
#let's change year to object since we plan to transform the year into a categorical variable
convert_dict={'year':object}
df_2000=df_2000.astype(convert_dict)
print(df_2000.dtypes)
df_2000.isnull().sum()
df_2000.describe()
df_2000.corr()
nrgy is highly correlated with dB dnce is moderately correlated with val
#let's verify the skewness
# Skewness per numeric attribute of the 2000s data.
df_2000.skew()
If the skewness is between -0.5 and 0.5, the data are fairly symmetrical: bpm, nrgy, dnce, dB, val, dur
If the skewness is between -1 and -0.5(negatively skewed) or between 0.5 and 1(positively skewed), the data are moderately skewed:
If the skewness is less than -1(negatively skewed) or greater than 1(positively skewed), the data are highly skewed: live, acous, pop, spch
# Visualisations for the 2000s, then the same exploration for the 2010s.
plotPerColumnDistribution(df_2000,nGraphShown=15,nGraphPerRow=3)
plotCorrelationMatrix(df_2000, 8)
#Scatter and density plots
plotScatterMatrix(df_2000, plotSize=20, textSize=10)
from matplotlib import pyplot
df_2000[['bpm','nrgy','dnce','dB','live','val','dur','acous','spch','pop']].plot(kind='box',subplots=True,layout=(3,4))
pyplot.show()
nRowsRead=1000
nRow,nCol=df_2010.shape
print(f'There are {nRow} rows and {nCol} columns in df_2010')
df_2010.info()
#let's change year to object since we plan to transform the year into a categorical variable
convert_dict={'year':object}
df_2010=df_2010.astype(convert_dict)
print(df_2010.dtypes)
df_2010.isnull().sum()
df_2010.describe()
df_2010.corr()
nrgy is highly correlated with dB and acous while moderately correlated with val
#let's verify the skewness
# Skewness per numeric attribute of the 2010s data.
df_2010.skew()
If the skewness is between -0.5 and 0.5, the data are fairly symmetrical: bpm, val
If the skewness is between -1 and -0.5(negatively skewed) or between 0.5 and 1(positively skewed), the data are moderately skewed:nrgy, dnce, dur
If the skewness is less than -1(negatively skewed) or greater than 1(positively skewed), the data are highly skewed: dB, live, acous, spch, pop
# Visualisations for the 2010s, then recode 'year' from individual years into
# one decade label per dataframe (the future categorical feature).
plotPerColumnDistribution(df_2010,nGraphShown=15,nGraphPerRow=3)
plotCorrelationMatrix(df_2010, 8)
#Scatter and density plots
plotScatterMatrix(df_2010, plotSize=20, textSize=10)
from matplotlib import pyplot
df_2010[['bpm','nrgy','dnce','dB','live','val','dur','acous','spch','pop']].plot(kind='box',subplots=True,layout=(3,4))
pyplot.show()
#let's transform the values of column 'year'
df_1950['year']='fifties'
df_1950.head()
df_1960['year']='sixties'
df_1970['year']='seventies'
df_1980['year']='eighties'
df_1990['year']='nineties'
# NOTE(review): 'twenties' for the 2000s and 'twenties_10' for the 2010s are
# misleading names, but they are only used as opaque category values, so
# renaming them would not change the analysis.
df_2000['year']='twenties'
df_2010['year']='twenties_10'
df_1980.head()
After reviewing the datasets, we observe that none of them has duplicated rows.
# Duplicate checks per decade, then concatenate the seven decades into one
# dataframe `df` and re-check duplicates/missing values on the union.
# calculate duplicate
dups=df_2010.duplicated()
# report if there are any duplicates
print(dups.any())
# list all duplicate rows
print(df_2010[dups])
df_1950_duplicate=df_1950[df_1950.duplicated()]
print('Duplicate Rows:')
df_1950_duplicate
df_2010_duplicate=df_2010[df_2010.duplicated()]
print('Duplicate Rows:')
df_2010_duplicate
df=pd.concat([df_1950,df_1960,df_1970,df_1980,df_1990,df_2000,df_2010],axis=0,ignore_index=True)
nRowsRead=1000
nRow,nCol=df.shape
print(f'There are {nRow} rows and {nCol} columns in df')
df.info()
df.isnull().sum()
# calculate duplicate
dupl=df.duplicated()
# report if there are any duplicates
print(dupl.any())
# list all duplicate rows
print(df[dupl])
We observe that there are no duplicated rows in our new dataset df
# Summary statistics and correlations on the combined, all-decades dataframe.
df.describe()
df.corr()
nrgy is highly correlated with dB and acous dnce is moderately correlated with val dB is moderately correlated with acous
# let's check the skewness
# Skewness per numeric attribute of the combined dataframe.
df.skew()
If the skewness is between -0.5 and 0.5, the data are fairly symmetrical: bpm, nrgy, dnce, val
If the skewness is between -1 and -0.5(negatively skewed) or between 0.5 and 1(positively skewed), the data are moderately skewed: dB, dur, acous, pop
If the skewness is less than -1(negatively skewed) or greater than 1(positively skewed), the data are highly skewed: live, spch
# Visualise the combined dataset, then build the modelling frame by dropping
# identifier columns that carry no predictive signal.
plotPerColumnDistribution(df,nGraphShown=15,nGraphPerRow=3)
plotCorrelationMatrix(df, 8)
#Scatter and density plots
plotScatterMatrix(df, plotSize=20, textSize=10)
# we will first set up a new data frame with the features which will be used in the model
df_new=df.drop(['Number','title','artist'], axis=1)
df_new.head()
df_new.columns
df_new.shape
# Sanity check for a suspected concatenated-genre artifact ('poppop').
(df_new['top genre']=='poppop').unique()
df_new['top genre'].unique()
# Consolidate the many niche sub-genres into broad buckets so that one-hot
# encoding 'top genre' stays manageable.  A bucket -> members table replaces
# the original parallel to_replace/value lists (which had to be kept aligned
# by hand); a single dict-based DataFrame.replace is behaviourally identical
# to the original sequence of list-based replace calls.
_genre_groups = {
    'adult': ['deep adult standards', 'adult standards'],
    'pop': ['brill building pop', 'doo-wop', 'dance pop', 'classic uk pop',
            'baroque pop', 'art pop', 'bebop', 'bubblegum pop', 'afropop',
            'pop', 'europop', 'classic country pop', 'new wave pop', 'bow pop',
            'classic danish pop', 'britpop', 'canadian pop', 'italian pop',
            'barbadian pop', 'electropop', 'belgian pop', 'hip pop',
            'irish pop', 'australian pop'],
    'rock': ['blues rock', 'art rock', 'rock-and-roll', 'chanson',
             'australian rock', 'classic rock', 'glam rock', 'soft rock',
             'country rock', 'german alternative rock', 'celtic rock',
             'album rock', 'modern rock', 'alternative rock', 'dance rock'],
    'classic': ['classic soul', 'classic girl group'],
    'blues': ['acoustic blues', 'blues', 'british blues'],
    'folk': ['canadian folk', 'american folk revival', 'appalachian folk',
             'drone folk', 'british folk'],
    'hip hop': ['hip hop', 'canadian hip hop', 'conscious hip hop',
                'detroit hip hop', 'bronx hip hop', 'atl hip hop',
                'east coast hip hop'],
    'country': ['alternative country', 'country'],
    'band': ['british dance band', 'boy band'],
    'funk': ['brit funk', 'g funk'],
    'rap': ['chicago rap', 'country rap', 'dfw rap', 'emo rap',
            'dirty south rap'],
    'r&b': ['alternative r&b', 'canadian contemporary r&b'],
    'house': ['deep house', 'disco house', 'electro house', 'disco',
              'electronic trap', 'chicago soul', 'avant-garde jazz'],
    'dance': ['german dance', 'bubble trance', 'belgian dance',
              'bubblegum dance', 'eurodance'],
    'other': ['british comedy', 'merseybeat', 'yodeling', 'british invasion',
              'beach music', 'cowboy western', 'boogaloo', 'afrobeat',
              'australian talent show', 'hollywood', 'native american',
              'glam metal', 'mellow gold', 'hi-nrg', 'glam punk', 'neo mellow',
              'big beat', 'alternative metal', 'permanent wave', 'latin',
              'big room', 'uk garage', 'brostep', 'complextro', 'edm',
              'indie poptimism', 'aussietronica', 'jazz'],
}
# Invert to a member -> bucket mapping and apply in a single pass.
_genre_map = {member: bucket
              for bucket, members in _genre_groups.items()
              for member in members}
df_new.replace(to_replace=_genre_map, inplace=True)
#let's impute both numerical and categorical variables from df ['top genre','bpm','nrgy','dnce','dB','live','val','dur','acous','spch','pop']
from sklearn.base import TransformerMixin
class DataFrameImputer(TransformerMixin):
    """Impute missing values column-by-column.

    Columns of dtype object are imputed with the most frequent value in the
    column; columns of other dtypes are imputed with the mean of the column.
    (The original version buried this docstring inside an otherwise empty
    ``__init__``; the default no-arg constructor is unchanged.)
    """

    def fit(self, X, y=None):
        # Per-column fill values: mode for object columns, mean otherwise.
        self.fill = pd.Series(
            [X[c].value_counts().index[0] if X[c].dtype == np.dtype('O')
             else X[c].mean() for c in X],
            index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)
# Impute the modelling frame, binarise the target (pop >= 66 -> 1, else 0),
# then collapse the rare genre buckets into a single 'others' class.
X=df_new
xt= DataFrameImputer().fit_transform(X)
print('before...')
print(X)
print('after...')
print(xt)
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=2)
# NOTE(review): the result below is never assigned, so the KNN imputation has
# no effect on `xt` — confirm whether it was meant to replace DataFrameImputer.
imputer.fit_transform(X)
xt.head()
xt.describe()
# Binarise popularity at the 66 threshold.
xt.loc[xt['pop']<66,'pop']=0
#xt.loc[(xt['pop']>=50) & (xt['pop'] <72),'pop']=1
xt.loc[xt['pop']>=66,'pop']=1
xt
#xt.loc[xt['pop']<50,'pop']=0
#xt.loc[(xt['pop']>=50) & (xt['pop'] <72),'pop']=1
#xt.loc[xt['pop']>=50,'pop']=1
#xt
print(xt.groupby('pop').size())
df_new.isnull().sum()
xt.isnull().sum()
xt['top genre'].value_counts().to_frame()
# Collapse the low-frequency genre buckets into one 'others' category.
xt.replace(to_replace=['other', 'classic', 'band', 'folk',
'country', 'blues', 'house', 'funk', 'dance', 'hip hop',
'british soul','rap', 'r&b'],
value=['others','others','others','others','others','others','others','others',
'others','others','others','others','others'],inplace=True)
xt['top genre'].unique()
xt['top genre'].value_counts().to_frame()
xt.head()
In our case we decided to use supervised selection, in the sense that we will use the target variable (pop, for popularity) with methods that remove irrelevant variables.
As we can see in our transformed dataset xt, we have only 12 features, which means we probably don't need to perform feature selection. However, the categorical variables such as top genre and year will need to be encoded, which will raise the number of features. We think it is important to first encode those 2 variables, scale our data, and perform feature selection afterwards.
We are going to encode the categorical variables and scale the numerical variables.
We will use a validation hold-out set. This is a sample of the data that we hold back from our analysis and modeling. We will use it at the end of our project to confirm the accuracy of our final model. It is a smoke test data we can use to see if we messed up and give us confidence on our estimates of accuracy on unseen data. We will use 80% of the dataset for modeling and hold back 20% for validation.
# Split-out validation dataset
# Hold back 20% of the rows as a final validation set; the first 11 columns
# are the features and column 11 ('pop') is the binary target.
#array=xt.values
X=xt.iloc[:,0:11]
Y=xt.iloc[:,11]
validation_size=0.2
seed=7  # fixed random_state for a reproducible split
X_train, X_validation, Y_train, Y_validation=train_test_split(X, Y, test_size=validation_size, random_state=seed)
print(X_train.shape,Y_train.shape,X_validation.shape,Y_validation.shape)
X_train
Y_train
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Fit the preprocessing on the TRAINING split only; the fitted objects are
# reused (transform only) on the validation split further below.
# Define which columns should be encoded vs scaled
columns_to_encode = ['top genre']
columns_to_scale = ['bpm', 'nrgy','dnce','dB','live','val','dur','acous','spch']
# Instantiate encoder/scaler
scaler = StandardScaler()
ohe = OneHotEncoder(sparse=False)  # NOTE(review): sparse= was renamed sparse_output= in scikit-learn 1.2
# Scale and Encode Separate Columns
scaled_columns = scaler.fit_transform(X_train[columns_to_scale])
encoded_columns = ohe.fit_transform(X_train[columns_to_encode])
# Concatenate (Column-Bind) Processed Columns Back Together
X_train_processed = np.concatenate([scaled_columns, encoded_columns], axis=1)
from sklearn.preprocessing import RobustScaler, OneHotEncoder
# Second preprocessing variant: RobustScaler is less sensitive to the
# outliers seen in the box plots.  NOTE(review): this overwrites the
# X_train_processed built with StandardScaler above, so the RobustScaler
# version is the one the models actually consume.
# Define which columns should be encoded vs scaled
columns_to_encode = ['top genre']
columns_to_scale = ['bpm', 'nrgy','dnce','dB','live','val','dur','acous','spch']
# Instantiate encoder/scaler
robust = RobustScaler()
ohe = OneHotEncoder(sparse=False)
# Scale and Encode Separate Columns
scaled_columns = robust.fit_transform(X_train[columns_to_scale])
encoded_columns = ohe.fit_transform(X_train[columns_to_encode])
# Concatenate (Column-Bind) Processed Columns Back Together
X_train_processed = np.concatenate([scaled_columns, encoded_columns], axis=1)
# try Normal scaler
#from sklearn.preprocessing import RobustScaler
#transformer = RobustScaler().fit(X)
#transformer
#RobustScaler()
scaled_columns.shape
print(X_train_processed[0])
X_train_processed
X_train_processed_df=pd.DataFrame(X_train_processed)
X_train_processed_df.head()
X_train_processed.shape
X_validation.shape
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Apply the TRAIN-fitted scaler/encoder to the validation split
# (transform only — no re-fitting, so no information leaks from validation).
# Define which columns should be encoded vs scaled
columns_to_encode = ['top genre']
columns_to_scale = ['bpm', 'nrgy','dnce','dB','live','val','dur','acous','spch']
# Instantiate encoder/scaler
#scaler = StandardScaler()
#ohe = OneHotEncoder(sparse=False)
# Scale and Encode Separate Columns
scaled_columns = scaler.transform(X_validation[columns_to_scale])
encoded_columns = ohe.transform(X_validation[columns_to_encode])
# Concatenate (Column-Bind) Processed Columns Back Together
X_validation_processed = np.concatenate([scaled_columns, encoded_columns], axis=1)
#from sklearn.preprocessing import RobustScaler, OneHotEncoder
# RobustScaler variant for the validation split; mirrors the training-side
# cells and overwrites the StandardScaler arrays built just above.
# Define which columns should be encoded vs scaled
columns_to_encode = ['top genre']
columns_to_scale = ['bpm', 'nrgy','dnce','dB','live','val','dur','acous','spch']
# Instantiate encoder/scaler
#scaler = StandardScaler()
#ohe = OneHotEncoder(sparse=False)
# Scale and Encode Separate Columns
scaled_columns = robust.transform(X_validation[columns_to_scale])
encoded_columns = ohe.transform(X_validation[columns_to_encode])
# Concatenate (Column-Bind) Processed Columns Back Together
X_validation_processed = np.concatenate([scaled_columns, encoded_columns], axis=1)
ohe.categories_
X_validation_processed
X_validation_processed.shape
# Targets are already 0/1 after the binarisation above; cast to int.
Y_train_enc, Y_validation_enc=Y_train.astype(int), Y_validation.astype(int)
Y_train_enc
#prepare target
# The original LabelEncoder-based helper had been commented out, which made
# the call below fail with a NameError.  It is restored here with a NumPy
# equivalent: np.unique yields the sorted classes and np.searchsorted maps
# each label to its class index, exactly like sklearn's LabelEncoder fitted
# on the training labels.
def prepare_targets(Y_train, Y_validation):
    """Encode train/validation labels as integers 0..n_classes-1.

    The encoding is fitted on the training labels only; validation labels
    are assumed to be drawn from the same set of classes.
    """
    classes = np.unique(Y_train)
    Y_train_enc = np.searchsorted(classes, Y_train)
    Y_validation_enc = np.searchsorted(classes, Y_validation)
    return Y_train_enc, Y_validation_enc
Y_train_enc, Y_validation_enc=prepare_targets(Y_train,Y_validation)
Y_train_enc
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
# feature selection
def select_features(X_train, Y_train, X_validation):
    """Score every feature with the ANOVA F-test and return transformed splits.

    k='all' keeps every feature, so the selector is used purely for the
    per-feature scores it exposes via ``fs.scores_``.
    Returns (X_train_fs, X_validation_fs, fitted_selector).
    """
    selector = SelectKBest(score_func=f_classif, k='all')
    # Learn the feature/target relationship on the training data only.
    selector.fit(X_train, Y_train)
    # Apply the (identity, since k='all') transform to both splits.
    return selector.transform(X_train), selector.transform(X_validation), selector
# feature selection
# Score every processed feature against the binary target with the ANOVA
# F-statistic; higher scores indicate more discriminative features.
X_train_fs, X_validation_fs, fs=select_features(X_train_processed,Y_train_enc,X_validation_processed)
# what are the scores for the features
for i in range(len(fs.scores_)):
    print('Feature %d: %f' % (i, fs.scores_[i]))
# plot the scores
pyplot.bar([i for i in range(len(fs.scores_))], fs.scores_)
pyplot.show()
Based on the scores of each feature, we will rank them and decide which ones will be chosen:
All features with a score less than 3 will not be selected, the following features will be dropped:
We will verify to which variables they correspond.
In order to select the most relevant feature in predicting our target variable 'popularity', we will use the Recursive Feature Elimination. As we aim to automatically select the number of Features chosen by RFE, we will perform cross-validation evaluation of different numbers of features, and automatically selecting the number of features that resulted in the best mean score. We will refer to the RFECV class.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from numpy import mean
from numpy import std
# create pipeline
#rfe=RFECV(estimator=DecisionTreeClassifier())
#model= DecisionTreeClassifier()
#pipeline=Pipeline(steps=[('s',rfe),('m',model)])
# evaluate model
#cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)
#n_scores=cross_val_score(pipeline,X_train_processed,Y_train_enc,scoring='accuracy',cv=cv,n_jobs=-1)
# report performance
#print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
# report which features were selected by RFE
# Recursive feature elimination with cross-validation: features are pruned
# on shrinking subsets and the best cross-validated subset is kept.
rfe=RFECV(estimator=RandomForestClassifier())
# fit RFECV
rfe.fit(X_train_processed, Y_train_enc)
# summarize all features
for i in range(X_train_processed.shape[1]):
    print('Column: %d, Selected=%s, Rank: %d' % (i, rfe.support_[i], rfe.ranking_[i]))
# logistic regression for feature importance
import matplotlib.pyplot as plt
# Coefficient magnitudes of a linear model as a rough importance measure
# (signed: the sign gives the direction of the association).
modelf=LogisticRegression()
# fit the model
modelf.fit(X_train_processed,Y_train_enc)
#get importance
importance=modelf.coef_[0]
# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
plt.bar([x for x in range(len(importance))], importance)
plt.show()
# random forest for feature importance on a classification problem
# Impurity-based importances; unlike coefficients these are non-negative.
modelrf=RandomForestClassifier()
# fit the model
modelrf.fit(X_train_processed,Y_train_enc)
#get importance
importance=modelrf.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
plt.bar([x for x in range(len(importance))], importance)
plt.show()
# Keep the first 13 processed columns, based on the selection results above
# (columns 0-8 are the 9 scaled numeric features; the rest are one-hot
# genre columns — presumably; verify against ohe.categories_).
X_train_selected=X_train_processed[:,[0,1,2,3,4,5,6,7,8,9,10,11,12]]
#X_train_selected=X_train_processed[:,[0,1,2,3,5,6,7,8,9,15,16,18,20,22,23,24]]
X_validation_selected=X_validation_processed[:,[0,1,2,3,4,5,6,7,8,9,10,11,12]]
#X_validation_selected=X_validation_processed[:,[0,1,2,3,5,6,7,8,9,15,16,18,20,22,23,24]]
Since we have a relatively small dataset, k-fold cross-validation can give a more robust estimate of performance compared to a single train/test split.
#X_train_selected=X_train_processed
#X_validation_selected=X_validation_processed
# let's merge train and test in one set
# (the k-fold CV below re-splits the data itself, so the hold-out is undone)
X_processed=np.concatenate((X_train_selected,X_validation_selected),axis=0)
Y_processed=np.concatenate((Y_train_enc,Y_validation_enc),axis=0)
# let's merge train and test in one set
#X_processed=np.concatenate((X_train_processed,X_validation_processed),axis=0)
#Y_processed=np.concatenate((Y_train_enc,Y_validation_enc),axis=0)
print(X_processed.shape)
print(Y_processed.shape)
from sklearn.svm import SVC
from sklearn import svm
from sklearn.model_selection import GridSearchCV
# Grid-search an SVC over kernel and C on the full processed data.
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(X_processed, Y_processed)
# NOTE(review): the expression below just constructs a fresh, unfitted
# GridSearchCV and discards it — it is the notebook cell's repr echo pasted
# back as code, not something that has any effect.
GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')})
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Randomly partition *dataset* into n_folds folds (sampling without
    replacement).

    Each fold holds len(dataset) // n_folds rows; any remainder rows are
    dropped, as in the classic from-scratch tutorial implementation.
    """
    # `randrange` was used without ever being imported at file level,
    # which made this function fail with a NameError.
    from random import randrange
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = len(dataset) // n_folds
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            # Draw a random remaining row so each fold is a random sample.
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split
def accuracy_metric(actual, predicted):
    """Return the accuracy of *predicted* against *actual* as a percentage."""
    hits = sum(actual[i] == predicted[i] for i in range(len(actual)))
    return hits / float(len(actual)) * 100.0
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Estimate *algorithm*'s accuracy on *dataset* via n-fold cross-validation.

    Each fold in turn is held out as the test set while the remaining folds
    are merged into the training set; labels are blanked out of the test
    copies before prediction.  Returns the list of per-fold accuracies.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        # Training set = every row from every other fold.
        others = list(folds)
        others.remove(fold)
        train_set = [row for other in others for row in other]
        # Test set = copies of this fold's rows with the label hidden.
        test_set = []
        for row in fold:
            clone = list(row)
            clone[-1] = None
            test_set.append(clone)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
    return scores
def zero_rule_algorithm_classification(train, test):
    """Baseline classifier: always predict the most frequent class in *train*."""
    labels = [row[-1] for row in train]
    majority = max(set(labels), key=labels.count)
    return [majority] * len(test)
# Sanity-check the cross-validation harness with the zero-rule baseline.
seed(1)  # reproducible fold assignment
n_folds = 5
scores = evaluate_algorithm(dataset, zero_rule_algorithm_classification, n_folds)
mean_acc = sum(scores) / len(scores)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % mean_acc)
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 10-fold CV estimates for logistic regression and a random forest,
# then a single hold-out accuracy check for the random forest.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)

model = LogisticRegression(solver='liblinear')
cv_scores = cross_val_score(model, X_processed, Y_processed, cv=kfold)
print('Accuracy: %.3f%% (%.3f%%)' % (cv_scores.mean() * 100.0, cv_scores.std() * 100.0))

model1 = RandomForestClassifier()
cv_scores = cross_val_score(model1, X_processed, Y_processed, cv=kfold)
print('Accuracy: %.3f%% (%.3f%%)' % (cv_scores.mean() * 100.0, cv_scores.std() * 100.0))

model1.fit(X_train_processed, Y_train_enc)
holdout = model1.score(X_validation_processed, Y_validation_enc)
print('Accuracy: %.3f%% ' % (holdout * 100.0))
Classification accuracy is the ratio of the number of correct predictions to the total number of predictions made.
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Explicit 'accuracy' scoring for the random forest via 10-fold CV.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
scoring = 'accuracy'
results = cross_val_score(model1, X_processed, Y_processed,
                          cv=kfold, scoring=scoring)
print('Accuracy: %.3f (%.3f)' % (results.mean(), results.std()))
A performance metric for evaluating predictions expressed as probabilities of membership in a given class. The scalar probability between 0 and 1 can be seen as a measure of confidence for a prediction by an algorithm. Correct and incorrect predictions are rewarded or penalized in proportion to the confidence of the prediction.
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Log loss rewards/penalises predictions in proportion to their confidence
# (sklearn reports it negated so larger is better).
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
results = cross_val_score(model1, X_processed, Y_processed,
                          cv=kfold, scoring='neg_log_loss')
print('Logloss: %.3f (%.3f)' % (results.mean(), results.std()))
from sklearn.metrics import confusion_matrix

# Confusion matrix of hold-out predictions for the random forest.
model1.fit(X_train_processed, Y_train_enc)
predicted = model1.predict(X_validation_processed)
print(confusion_matrix(Y_validation_enc, predicted))
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the hold-out set.
model1.fit(X_train_processed, Y_train_enc)
predicted = model1.predict(X_validation_processed)
print(classification_report(Y_validation_enc, predicted))
Spot-checking is a way of discovering which algorithms perform well on your machine learning problem, as we cannot know beforehand which algorithms are best suited to it. What algorithms should I spot-check on my dataset?
Let's design our test harness. We will use 10-fold cross-validation. The dataset is not too small and this is a good standard test harness configuration. We will evaluate algorithms using the accuracy metric. This is a gross metric that will give a quick idea of how correct a given model is.
# Test options and evaluation metric
num_folds = 10
# NOTE(review): rebinding `seed` to an int shadows random.seed() imported
# earlier (it was called as seed(1) in the harness test above); rename one
# of the two if the function is ever needed again after this point.
seed = 7
scoring = 'accuracy'
Let's create a baseline of performance on this problem and spot-check a number of different algorithms. We will select a suite of different algorithms capable of working on this classification problem. The seven algorithms selected include:
# Spot-check suite: seven classifiers with default settings, covering
# linear, instance-based, tree, probabilistic, kernel and ensemble methods.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('kNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
    ('RDF', RandomForestClassifier()),
]
The algorithms all use default tuning parameters. Let's compare the algorithms. We will display the mean and standard deviation of accuracy for each algorithm as we calculate it and collect the results for use later.
# Evaluate each candidate with 10-fold CV and report mean/std accuracy,
# collecting the per-fold results for the comparison plot below.
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_results = cross_val_score(model, X_train_processed, Y_train_enc,
                                 cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = '%s: %f (%f)' % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Compare algorithms side by side with a box-and-whisker plot.
# Fixed: the bare name `pyplot` was used here, but the file imports
# `matplotlib.pyplot as plt` (see header) and uses `plt` elsewhere, so
# `pyplot` would raise a NameError; use `plt` consistently.
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
from sklearn.metrics import roc_auc_score

# Random forest: hold-out accuracy and AUC.
# NOTE(review): roc_auc_score on hard class labels assumes a binary target
# and is coarser than using predict_proba scores — confirm the target here
# is binary.
rdf = RandomForestClassifier()
rdf.fit(X_train_processed, Y_train_enc)
rdf_pred = rdf.predict(X_validation_processed)
print("Accuracy: " + str(accuracy_score(Y_validation_enc, rdf_pred)))
print("AUC: " + str(roc_auc_score(Y_validation_enc, rdf_pred)))
from sklearn.metrics import roc_auc_score

# Logistic regression: hold-out accuracy and AUC for comparison.
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train_processed, Y_train_enc)
logreg_pred = logreg.predict(X_validation_processed)
print("Accuracy: " + str(accuracy_score(Y_validation_enc, logreg_pred)))
print("AUC: " + str(roc_auc_score(Y_validation_enc, logreg_pred)))
from xgboost import XGBClassifier

# Gradient-boosted trees as an additional benchmark on the hold-out split.
xgb_model = XGBClassifier()
xgb_model.fit(X_train_processed, Y_train_enc)
predictions = xgb_model.predict(X_validation_processed)
accuracy = accuracy_score(Y_validation_enc, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
The results show a tight distribution for RDF, SVM, CART and KNN, which is encouraging, suggesting low variance. However, we notice outliers for CART.
# Standardize the dataset
# NOTE(review): disabled experiment kept for reference — StandardScaler +
# each classifier as a Pipeline, evaluated with the same 10-fold CV harness
# as the unscaled comparison above.
#pipelines = []
#pipelines.append(('ScaledLR',Pipeline([('Scaler', StandardScaler()), ('LR', LogisticRegression(solver='liblinear'))])))
#pipelines.append(('ScaledLDA',Pipeline([('Scaler', StandardScaler()), ('LDA', LinearDiscriminantAnalysis())])))
#pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsClassifier())])))
#pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeClassifier())])))
#pipelines.append(('ScaledNB', Pipeline([('Scaler', StandardScaler()), ('NB', GaussianNB())])))
#pipelines.append(('ScaledSVM', Pipeline([('Scaler', StandardScaler()), ('SVM', SVC(gamma='auto'))])))
#results = []
#names = []
#for name, model in pipelines:
#	kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
#	cv_results = cross_val_score(model, X_train_processed,Y_train_enc, cv=kfold,scoring=scoring)
#	results.append(cv_results)
#	names.append(name)
#	msg = '%s: %f (%f)' % (name, cv_results.mean(), cv_results.std())
#	print(msg)
# Tune k for KNN: grid search over odd neighbour counts 1..21.
neighbors = list(range(1, 22, 2))
param_grid = dict(n_neighbors=neighbors)
model = KNeighborsClassifier()
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
grid = GridSearchCV(estimator=model, param_grid=param_grid,
                    scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train_processed, Y_train_enc)
print('Best: %f using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_summary = grid_result.cv_results_
for mean, stdev, param in zip(cv_summary['mean_test_score'],
                              cv_summary['std_test_score'],
                              cv_summary['params']):
    print('%f (%f) with: %r' % (mean, stdev, param))
# Tune SVC: grid search over regularisation strength C and kernel type.
c_values = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0]
kernel_values = ['linear', 'poly', 'rbf', 'sigmoid']
param_grid = {'C': c_values, 'kernel': kernel_values}
model = SVC(gamma='auto')
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
grid = GridSearchCV(estimator=model, param_grid=param_grid,
                    scoring=scoring, cv=kfold)
grid_result = grid.fit(X_train_processed, Y_train_enc)
print('Best: %f using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_summary = grid_result.cv_results_
for mean, stdev, param in zip(cv_summary['mean_test_score'],
                              cv_summary['std_test_score'],
                              cv_summary['params']):
    print('%f (%f) with: %r' % (mean, stdev, param))
# Ensemble candidates (boosting and bagging variants) evaluated with the
# same 10-fold CV harness used for the base models.
ensembles = [
    ('AB', AdaBoostClassifier()),
    ('GBM', GradientBoostingClassifier()),
    ('RF', RandomForestClassifier(n_estimators=10)),
    ('ET', ExtraTreesClassifier(n_estimators=10)),
]
results = []
names = []
for name, model in ensembles:
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_results = cross_val_score(model, X_train_processed, Y_train_enc,
                                 cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = '%s: %f (%f)' % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Compare ensembles side by side.
# Fixed: the bare name `pyplot` was used here, but the file imports
# `matplotlib.pyplot as plt` (see header) and uses `plt` elsewhere, so
# `pyplot` would raise a NameError; use `plt` consistently.
fig = plt.figure()
fig.suptitle('Ensemble Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
The RandomForestClassifier looked the most promising model for this problem. We will finalize the model by training it on the entire training dataset and make predictions on the hold-out validation dataset to confirm our findings.
# Finalise: train the chosen random forest on the full training set and
# confirm performance on the hold-out validation split.
model = RandomForestClassifier()
model.fit(X_train_processed, Y_train_enc)
final_predictions = model.predict(X_validation_processed)
for report_fn in (accuracy_score, confusion_matrix, classification_report):
    print(report_fn(Y_validation_enc, final_predictions))